library(plotly)
library(data.table)
library(tidyr)
library(knitr)
library(heatmaply)

Preprocessing

  • Load data file
  • rename genres for better readability
    • “Religion, Spirituality & New Age” to “Religion”
    • “Science.fiction” to “SciFi”
    • “Action.and.Adventure” to “Action”

All genres:

books_mat <- read.csv(file_bookstore, row.names = 1)
rownames(books_mat) <- make.names(rownames(books_mat))
rownames(books_mat) <- sub("Science.fiction", "SciFi", rownames(books_mat))
rownames(books_mat) <- sub("Action.and.Adventure", "Action", rownames(books_mat))
rownames(books_mat) <- sub("Religion..Spirituality...New.Age", 
    "Religion", rownames(books_mat)
)
colnames(books_mat) <- rownames(books_mat)
rownames(books_mat)
 [1] "Satire"        "SciFi"         "Drama"         "Action"        "Romance"       "Mystery"       "Horror"       
 [8] "Self.help"     "Health"        "Guide"         "Travel"        "Children.s"    "Religion"      "Science"      
[15] "History"       "Math"          "Anthology"     "Poetry"        "Encyclopedias" "Dictionaries"  "Comics"       
[22] "Art"           "Cookbooks"     "Diaries"       "Journals"     
  • Check if upper and lower triangle identical
is_upper_lower <- identical(
    books_mat[upper.tri(books_mat)], 
    t(books_mat)[upper.tri(books_mat)]
)
is_upper_lower
[1] TRUE
  • Transform to long and tidy data.table
books_dt <- as.data.table(books_mat, keep.rownames = TRUE)
setnames(books_dt, c('genreA',colnames(books_mat)))
books_dt <- as.data.table(gather(books_dt, genreB, customers, Satire:Journals))
head(books_dt)
  • Average number of genres per customer
sum(books_dt[genreA==genreB, customers])/total_customers
[1] 2.332187

First ideas

Show me everything!

hm <- heatmapr(books_mat)
heatmaply(hm, 
    plot_method = 'plotly', 
    colors =  c('grey95', 'dodgerblue')
)
  • Romance, SciFi, Action, History are most bought
  • bought-together clusters:
    • Romance, SciFi, Action, History
    • Dictionaries and Comics
    • Math and Poetry
  • Mystery is an outlier

Most bought genre

plot_ly(data=books_dt[genreA==genreB][order(customers)], 
    x=~genreA, y=~customers, type="bar"
)%>% layout(
    margin=list(b=100), 
    xaxis=list(categoryorder="trace"),
    title="Most bought genre"
)

Best pairs

all_genres <- unique(books_dt$genreA)
all_pairs <- combn(all_genres, 2, simplify = F)
pair_customers <- 
pair_dt <- data.table(
    genre_pairs = sapply(all_pairs, function(p){
        paste(sort(p), collapse = "&")}
    ),
    pair_customers = sapply(all_pairs, function(p){
        books_dt[genreA==p[1] & genreB==p[2], customers]
    })
)
plot_ly(data=pair_dt[order(pair_customers, decreasing=T)][1:10], type='bar', 
    x=~genre_pairs, y=~pair_customers
)%>% layout(
    margin=list(b=100), 
    xaxis=list(categoryorder="trace"),
    title="Top 10 genre pairs"
)
  • mostly combinations of most bought genres

Special genres

Hypothesis

  • If a customer buys more than 2 genres, he is recorded in more than 1 off-diagonal entry:
    • (2*diagonal - colSum) < 0
  • If a genre is bought more often alone than in triplets (or higher):
    • (2*diagonal - colSum) > 0

Look for customers that buy only one genre

  • Compare column sum and 2*diagonal value
  • generate table with {genre, {2*diagonal-colSum}}
all_genres <- unique(books_dt$genreA)
selective_dt <- data.table()
for(g in all_genres){
    d <- books_dt[genreA==g & genreB==g, customers]
    cs <- sum(books_dt[genreA==g, customers])
    dd <- I(2*d - cs)
    selective_dt <- rbind(selective_dt, data.table(genre=g, diag_diff=dd))
}
p_sel <- plot_ly(
    data=selective_dt[order(diag_diff)], 
    y=~genre, x=~diag_diff, type="bar", 
    color = ~diag_diff>0, colors=c("gray", "darkgreen")
)%>% layout(
    margin=list(l=100), 
    yaxis=list(categoryorder="trace", title=''),
    xaxis=list(title='2*diagonal - columnSum'),
    title="Which genres are bought alone?"
)
show(p_sel)
  • Mystery and Horror are mostly bought alone
  • Satire and Travel rather bought in pairs

Normalize columns by diagonal

books_dt[,
    rel_customers:= (customers/books_dt[genreA==genreB, customers]), 
    by=genreB
    ]
head(books_dt[order(genreA)])

–> genreB relative to genreA-diagonal value

Look at all data unsorted: No pattern.

plot_ly(data=books_dt) %>%
    add_heatmap(
        z=~rel_customers, x=~genreA, y=~genreB, colors= c('grey95', 'dodgerblue')
    ) %>%
    layout(
        margin=list(b=110, l=110)
    )

With clustering of rows and columns (Note: they are different now):

  • 2 hubs on genreA axis (top dendro)
    • Art, Journals, Action, SciFi, History
    • Encyclopedias, Comics, Disctionaries, Poetry, Math, Anthology
    • e.g. genres that were bought with Art were also bought together with Journals
  • 2 hubs on genreB axis (right dendro)
    • Romance, History, Action, SciFi –> Romance instead of Art and Journals
    • same
  • bought with everything else? Romance

Most favorite partner genre

–> about 20% customers additionally bought SciFi and Romance

Relative best pairs

–> Math is poetry and History is Science fiction!

---
title: "Allianz DataViz Challenge"
author: "Daniel Bader"
output:
  html_document:
    toc: yes
    toc_float: yes
  html_notebook:
    toc: yes
    toc_float: yes
---

```{r, message=FALSE, echo=T}
library(plotly)
library(data.table)
library(tidyr)
library(knitr)
library(heatmaply)
```


```{r, echo=FALSE}
opts_chunk$set(echo=FALSE, cache=F)
total_customers <- 195387
file_bookstore <- file.path("~/Downloads/toydata/book_genres_data.csv")
source("build_book_store.R")
```


# Preprocessing

* Load data file
* rename genres for better readability
    * "Religion, Spirituality & New Age" to "Religion"
    * "Science.fiction" to "SciFi"
    * "Action.and.Adventure" to "Action"
    
All genres:
```{r}
books_mat <- read.csv(file_bookstore, row.names = 1)
rownames(books_mat) <- make.names(rownames(books_mat))
rownames(books_mat) <- sub("Science.fiction", "SciFi", rownames(books_mat))
rownames(books_mat) <- sub("Action.and.Adventure", "Action", rownames(books_mat))
rownames(books_mat) <- sub("Religion..Spirituality...New.Age", 
    "Religion", rownames(books_mat)
)
colnames(books_mat) <- rownames(books_mat)
rownames(books_mat)
```

* Check if upper and lower triangle identical

```{r}
is_upper_lower <- identical(
    books_mat[upper.tri(books_mat)], 
    t(books_mat)[upper.tri(books_mat)]
)
is_upper_lower
```

* Transform to long and tidy `data.table`

```{r}
books_dt <- as.data.table(books_mat, keep.rownames = TRUE)
setnames(books_dt, c('genreA',colnames(books_mat)))
books_dt <- as.data.table(gather(books_dt, genreB, customers, Satire:Journals))
```

```{r, echo=T}
head(books_dt)
```


* Average number of genres per customer

```{r}
sum(books_dt[genreA==genreB, customers])/total_customers
```


# First ideas

## Show me everything!

```{r, fig.width=8, fig.height=8}
hm <- heatmapr(books_mat)
heatmaply(hm, 
    plot_method = 'plotly', 
    colors =  c('grey95', 'dodgerblue')
)
```

* Romance, SciFi, Action, History are most bought 
* bought-together clusters:
    * Romance, SciFi, Action, History
    * Dictionaries and Comics
    * Math and Poetry
* Mystery is an outlier

## Most bought genre

```{r}
plot_ly(data=books_dt[genreA==genreB][order(customers)], 
    x=~genreA, y=~customers, type="bar"
)%>% layout(
    margin=list(b=100), 
    xaxis=list(categoryorder="trace"),
    title="Most bought genre"
)
```

## Best pairs

```{r}
all_genres <- unique(books_dt$genreA)
all_pairs <- combn(all_genres, 2, simplify = F)
pair_customers <- 
pair_dt <- data.table(
    genre_pairs = sapply(all_pairs, function(p){
        paste(sort(p), collapse = "&")}
    ),
    pair_customers = sapply(all_pairs, function(p){
        books_dt[genreA==p[1] & genreB==p[2], customers]
    })
)
plot_ly(data=pair_dt[order(pair_customers, decreasing=T)][1:10], type='bar', 
    x=~genre_pairs, y=~pair_customers
)%>% layout(
    margin=list(b=100), 
    xaxis=list(categoryorder="trace"),
    title="Top 10 genre pairs"
)
```

* mostly combinations of most bought genres


# Special genres

Hypothesis

* If a customer buys more than 2 genres, 
he is recorded in more than 1 off-diagonal entry:
    * (2*diagonal - colSum) < 0
* If a genre is bought more often alone than in triplets (or higher): 
    * (2*diagonal - colSum) > 0


Look for customers that buy only one genre

* Compare `column sum` and  `2*diagonal value`
* generate table with `{genre, {2*diagonal-colSum}}`


```{r, warning=FALSE, fig.width=8}
all_genres <- unique(books_dt$genreA)
selective_dt <- data.table()
for(g in all_genres){
    d <- books_dt[genreA==g & genreB==g, customers]
    cs <- sum(books_dt[genreA==g, customers])
    dd <- I(2*d - cs)
    selective_dt <- rbind(selective_dt, data.table(genre=g, diag_diff=dd))
}

p_sel <- plot_ly(
    data=selective_dt[order(diag_diff)], 
    y=~genre, x=~diag_diff, type="bar", 
    color = ~diag_diff>0, colors=c("gray", "darkgreen")
)%>% layout(
    margin=list(l=100), 
    yaxis=list(categoryorder="trace", title=''),
    xaxis=list(title='2*diagonal - columnSum'),
    title="Which genres are bought alone?"
)

show(p_sel)
```
* Mystery and Horror are mostly bought alone
* Satire and Travel rather bought in pairs



# Normalize columns by diagonal

```{r}
books_dt[,
    rel_customers:= (customers/books_dt[genreA==genreB, customers]), 
    by=genreB
    ]
head(books_dt[order(genreA)])
```

--> genreB relative to genreA-diagonal value 

Look at all data unsorted: No pattern.

```{r, fig.width=8, fig.height=8}
plot_ly(data=books_dt) %>%
    add_heatmap(
        z=~rel_customers, x=~genreA, y=~genreB, colors= c('grey95', 'dodgerblue')
    ) %>%
    layout(
        margin=list(b=110, l=110)
    )
```

With clustering of rows and columns (Note: they are different now):

```{r, fig.width=8, fig.height=8}
books_relmat <- dcast(books_dt, genreA ~ genreB, value.var = "rel_customers")
books_relmat <- as.matrix(books_relmat[,genreA:=NULL])
rownames(books_relmat) <- colnames(books_relmat)

hmrel <- heatmapr(t(books_relmat), k_col=3, k_row=3)
heatmaply(
    x=hmrel, 
    plot_method = 'plotly', 
    colors =  c('grey95', 'dodgerblue'),
    xlab='genreA', ylab='genreB'
) %>% layout(
    title='Customers of genreB relative to genreA',
    margin=list(t=50)
)
```

* 2 hubs on genreA axis (top dendro)
    * Art, Journals, Action, SciFi, History
    * Encyclopedias, Comics, Disctionaries, Poetry, Math, Anthology
    * e.g. genres that were bought with Art were also bought together with Journals
* 2 hubs on genreB axis (right dendro)
    * Romance, History, Action, SciFi --> Romance instead of Art and Journals
    * same
* bought with everything else? Romance


## Most favorite partner genre

```{r}
plot_ly(
    data=books_dt[,median(rel_customers), by=genreB][order(V1, decreasing = T)]
    )%>%
    add_bars(x=~genreB, y=~V1)%>%
    layout(
        yaxis=list(title='Median relative customers'),
        xaxis=list(categoryorder='trace'),
        margin=list(b=100)
    )
```

--> about 20% customers additionally bought SciFi and Romance


## Relative best pairs

```{r}
plot_ly(
        data = books_dt[genreA != genreB][order(rel_customers, decreasing = T)][1:10]
    ) %>% 
    add_bars(
        x=~paste0(genreA, "&", genreB), y=~rel_customers
        ) %>% 
    layout(
        margin=list(b=100, r=80), 
        xaxis=list(categoryorder="trace", title=''),
        yaxis=list(exponentformat='none'),
        title="Top 10 relative genre pairs"
)
```

--> **Math is poetry and History is Science fiction!**
